
## This file infers the race/ethnicity of inventor names using ethnicolr
## Zihao Li. 06/2024

import sys, os
import csv
import pandas as pd
import numpy as np
from ethnicolr import pred_fl_reg_name
import warnings
warnings.filterwarnings('ignore')

# Root folders for reading inputs and writing outputs (currently the same path).
input_dir = r'/Volumes/Zihao_SSD2/PatentsView/'
output_dir = r'/Volumes/Zihao_SSD2/PatentsView/'
# Switch the process working directory to the output folder at import time.
os.chdir(output_dir)

def main():
    """Predict race/ethnicity for every inventor name and save the result.

    Reads ``temp/g_inventor_clean_nomidname.csv`` under ``input_dir``, runs
    ``pred_fl_reg_name`` on the ``last_name_clean`` / ``first_name_clean``
    columns in batches of ``batch_size`` rows, and writes the concatenated
    predictions to ``temp/g_inventor_race.csv`` under ``output_dir``.

    The batch index is printed before each batch as a progress indicator.
    """
    df = pd.read_csv(input_dir + 'temp/g_inventor_clean_nomidname.csv', encoding='utf-8')
    batch_size = 100
    n_rows = df.shape[0]

    # Step through the frame by batch_size up to n_rows so that the final,
    # possibly shorter, batch is processed as well. The earlier floor-based
    # loop skipped it and relied on a hard-coded row range that was only
    # valid for one particular input file.
    batches = []
    for i, start in enumerate(range(0, n_rows, batch_size)):
        print(i)
        df_sub = df.iloc[start:start + batch_size]
        batches.append(pred_fl_reg_name(df_sub, 'last_name_clean', 'first_name_clean'))

    # Concatenate once at the end: growing a DataFrame with pd.concat inside
    # the loop copies all accumulated rows on every iteration.
    # pd.concat raises on an empty list, so fall back to an empty frame.
    df_race = pd.concat(batches) if batches else pd.DataFrame()
    df_race.to_csv(output_dir + 'temp/g_inventor_race.csv', index=False)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()







